The goal of this analysis is to assess the SMP population with CWL
monitoring data, categorize them and find the underrepresented smp
features in our data set. This is to ensure our monitoring effort
results in a more comprehensive data set that covers all types of SMPs
with varying design metrics such as loading ratios. The output of this
analysis can assist the data collection team with their future site
selection process.
#Libraries
library(odbc)
library(DBI)
library(lubridate)
library(tidyverse)
library(stats)
library(gridExtra)
library(grid)
library(gtable)
library(ggtext)
library(dplyr)
library(ggplot2)
library(knitr)
library(plotly)
library(cluster)
library(factoextra)
# DB PG14
con <- dbConnect(odbc::odbc(), dsn = "mars14_data", uid = Sys.getenv("shiny_uid"), pwd = Sys.getenv("shiny_pwd"), MaxLongVarcharSize = 8190)
#Getting list of SMPs
cwl_smp <- dbGetQuery(con,"SELECT DISTINCT smp_id
FROM fieldwork.viw_deployment_full_cwl")
#Getting the greenit info
smpbdv <- dbGetQuery(con,"SELECT * FROM external.tbl_smpbdv")
sysbdv <- dbGetQuery(con,"SELECT * FROM external.tbl_systembdv")
greenit_unified <-dbGetQuery(con,"SELECT * FROM external.viw_greenit_unified")
# join the greenit table with our smp list
smp_features <- cwl_smp %>%
inner_join(smpbdv, by="smp_id")
# Unmonitored SMPs
unmonitored_smp <- read.csv("C:\\Users\\Farshad.Ebrahimi\\OneDrive - City of Philadelphia\\Github Projects\\smp-population-assesment\\unmonitored_sites.csv") %>%
select(smp_id = SMP.ID) %>%
distinct()
# unmonitored smp_features
unmonitored_smp_features <- unmonitored_smp %>%
inner_join(smpbdv, by="smp_id")
The following code chunk calculates the break-down of SMP types with
CWL data.
#number of smp types
smp_type_break <- smp_features %>%
group_by(smp_smptype) %>%
summarise(count = n()) %>%
select(Type = smp_smptype, count)

#number of unmonitored smps
unmonitored_smp_type_break <- unmonitored_smp_features %>%
group_by(smp_smptype) %>%
summarise(count = n()) %>%
select(Type = smp_smptype, count)
The following Pie chart shows the break-down of all SMPs (monitored
and unmonitored).


#combining all smps (monitored and unmonitored) and get fractions based on type of smp-finally normalize them
total_smp <- unmonitored_smp_type_break %>%
full_join(smp_type_break, by="Type")
total_smp[is.na(total_smp)] <- 0
total_smp <- total_smp %>%
mutate(total_number = count.x+count.y) %>%
select(Type, total_number)
total_smp <- total_smp %>%
mutate(sum_all = sum(total_number)) %>%
mutate(percent_all = (total_number/sum_all)*100)
#get the percent of monitored SMP
smp_type_break_fraction <- smp_type_break %>%
mutate(sum_all = sum(count)) %>%
mutate(percent_monitored = (count/sum_all)*100)
#normalize the monitored smp by the total smp percents
normalized_fractions <- smp_type_break_fraction %>%
full_join(total_smp, by="Type") %>%
mutate(normalized_percent = percent_monitored/percent_all) %>%
select(Type, percent_monitored, percent_all, normalized_percent)
normalized_fractions[is.na(normalized_fractions)] <- 0
The following table is aimed to identify the over-representing
(normalized percent > 1) and under-representing (normalized percent
<1) SMP types.
#calculating the normalized percentage of smp types
kable(arrange(normalized_fractions, normalized_percent), caption = "Normalized SMP Break-down")
Normalized SMP Break-down
| Green Roof |
0.0000000 |
0.1490313 |
0.0000000 |
| Stormwater Tree |
0.0000000 |
4.1728763 |
0.0000000 |
| Bumpout |
2.5862069 |
8.4947839 |
0.3044465 |
| Swale |
1.2931034 |
2.5335320 |
0.5103955 |
| Planter |
5.8189655 |
7.5260805 |
0.7731734 |
| Infiltration/Storage Trench |
26.0775862 |
29.5081967 |
0.8837404 |
| Basin |
0.2155172 |
0.2235469 |
0.9640805 |
| Rain Garden |
12.0689655 |
11.1028316 |
1.0870169 |
| Tree Trench |
50.6465517 |
35.6929955 |
1.4189493 |
| Pervious Paving |
0.4310345 |
0.2980626 |
1.4461207 |
| Drainage Well |
0.8620690 |
0.2980626 |
2.8922414 |
The table above shows Tree trenches are over-represented in our CWL
data, while green roof, stormwater tree, bumpout, swale and planters are
under-represented (due to monitoring limitation, etc).
The following section is focused on establishing the design metrics
range for SMPs in the unmonitored SMP list. These design metrics include
drainage area, storm size managed and loading ratio.
#rawstormsizemanaged-in , sys_impervda_ft2, and loading ratio metrics gathering
table_all <- unmonitored_smp_features %>%
left_join(sysbdv, by="system_id") %>%
mutate(loading_ratio = sys_lrtotalda_ft2 )
#Clustring analysis based on 3 features of drainage area, loading ratio, and storm sized managed
clustered_moniored <- table_all %>%
select(system_id, sys_impervda_ft2, loading_ratio, sys_rawstormsizemanaged_in) %>%
na.omit() %>%
distinct()
#normalze using mean-sd standardization
normalized_cluster <- clustered_moniored
normalized_cluster[,2:4] <- scale(clustered_moniored[,2:4])
#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:4], kmeans, method = "wss")

#k-means
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:4], centers = 4)
# Store the cluster assignments back into the clustering data frame object
clustered_moniored$cluster <-factor(cluster_solution$cluster)
# Look at the distribution of cluster assignments
table(clustered_moniored$cluster)
1 2 3 4
21 279 136 11
# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_moniored %>%
group_by(cluster) %>%
summarize_if(is.numeric, mean)
# View the resulting table
kable(sys_clus_avg)
| 1 |
17695.05 |
68.36332 |
1.491118 |
| 2 |
20363.52 |
12.75067 |
1.886491 |
| 3 |
17350.55 |
23.52892 |
1.212609 |
| 4 |
62204.82 |
10.56821 |
3.843215 |
ggplot(data = clustered_moniored, aes(x = loading_ratio, y = sys_rawstormsizemanaged_in, color = cluster)) +
geom_point()

plot_ly(clustered_moniored, x = ~sys_impervda_ft2, y = ~loading_ratio, z = ~sys_rawstormsizemanaged_in) %>%
add_markers(color = ~cluster)
In order to have a more relevant clustering method that distinguishes
between general types of systems (lined vs unlined or bioretention vs
bio infiltration), the unmonitored SMPs were devided into broad
categories of Bioinfiltration, Bioretention (lined), Bioretention
(unlined), Subsurface infiltration, Subsurface slow release (lined), and
Subsurface slow release (unlined). This is done by a system-level
feature called sys_modelinputcategory. The code chunk below shows a
break down of these SMPs. Please note that green roof and permeable
pavements are not included due to the low number.
#sys_modelinputcategory categories
sys_cat <- table_all %>%
select(system_id, sys_modelinputcategory)%>%
distinct %>%
group_by(sys_modelinputcategory) %>%
summarise(count = n())
kable(sys_cat)
| Bioinfiltration |
124 |
| Bioretention (lined) |
29 |
| Bioretention (unlined) |
83 |
| Green Roof |
2 |
| Permeable Pavement |
1 |
| Subsurface infiltration |
159 |
| Subsurface slow release (lined) |
113 |
| Subsurface slow release (unlined) |
113 |
clustered_Bioinfiltration <- table_all %>%
filter(sys_modelinputcategory == "Bioinfiltration") %>%
select(system_id, sys_impervda_ft2, loading_ratio, sys_rawstormsizemanaged_in, sys_modelinputcategory) %>%
na.omit() %>%
distinct()
#normalze using mean-sd standardization
normalized_cluster <- clustered_Bioinfiltration
normalized_cluster[,2:4] <- scale(clustered_Bioinfiltration[,2:4])
#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:4], kmeans, method = "wss")

#k-means
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:4], centers = 4)
# Store the cluster assignments back into the clustering data frame object
clustered_Bioinfiltration$cluster <-factor(cluster_solution$cluster)
# Look at the distribution of cluster assignments
table(clustered_Bioinfiltration$cluster)
1 2 3 4
17 31 72 4
# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_Bioinfiltration %>%
group_by(cluster) %>%
summarize_if(is.numeric, mean)
# View the resulting table
kable(sys_clus_avg)
| 1 |
11184.24 |
40.58019 |
1.063568 |
| 2 |
51515.00 |
15.79839 |
1.752800 |
| 3 |
15718.52 |
12.39893 |
1.778769 |
| 4 |
22358.50 |
10.18006 |
4.348037 |
#3 D plot
plot_ly(clustered_Bioinfiltration, x = ~sys_impervda_ft2, y = ~loading_ratio, z = ~sys_rawstormsizemanaged_in) %>%
add_markers(color = ~cluster)
Bioretention (lined) clustering. Note that loading ratio field is not
applicable here.
clustered_Bioreten_lined <- table_all %>%
filter(sys_modelinputcategory == "Bioretention (lined)") %>%
select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, sys_modelinputcategory) %>%
na.omit() %>%
distinct()
#normalze using mean-sd standardization
normalized_cluster <- clustered_Bioreten_lined
normalized_cluster[,2:3] <- scale(clustered_Bioreten_lined[,2:3])
#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:3], kmeans, method = "wss")

#k-means
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:3], centers = 2)
# Store the cluster assignments back into the clustering data frame object
clustered_Bioreten_lined$cluster <-factor(cluster_solution$cluster)
# Look at the distribution of cluster assignments
table(clustered_Bioreten_lined$cluster)
1 2
5 24
# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_Bioreten_lined %>%
group_by(cluster) %>%
summarize_if(is.numeric, mean)
# View the resulting table
kable(sys_clus_avg)
| 1 |
73919.40 |
2.270016 |
| 2 |
18291.83 |
1.473926 |
#3 D plot
plot_ly(clustered_Bioreten_lined, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in) %>%
add_markers(color = ~cluster)
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Bioretention (unlined) clustering.
clustered_Bioreten_unlined <- table_all %>%
filter(sys_modelinputcategory == "Bioretention (unlined)") %>%
select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, loading_ratio, sys_modelinputcategory) %>%
na.omit() %>%
distinct()
#normalze using mean-sd standardization
normalized_cluster <- clustered_Bioreten_unlined
normalized_cluster[,2:4] <- scale(clustered_Bioreten_unlined[,2:4])
#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:4], kmeans, method = "wss")

#k-means
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:4], centers = 3)
# Store the cluster assignments back into the clustering data frame object
clustered_Bioreten_unlined$cluster <-factor(cluster_solution$cluster)
# Look at the distribution of cluster assignments
table(clustered_Bioreten_unlined$cluster)
1 2 3
9 21 53
# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_Bioreten_unlined %>%
group_by(cluster) %>%
summarize_if(is.numeric, mean)
# View the resulting table
kable(sys_clus_avg)
| 1 |
7046.333 |
3.309078 |
4.263439 |
| 2 |
42812.095 |
1.525561 |
20.624402 |
| 3 |
16789.575 |
1.639162 |
12.959101 |
#3 D plot
plot_ly(clustered_Bioreten_unlined, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in, z= ~loading_ratio) %>%
add_markers(color = ~cluster)
clustered_subsurface <- table_all %>%
filter(sys_modelinputcategory == "Subsurface infiltration") %>%
select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, loading_ratio, sys_modelinputcategory) %>%
na.omit() %>%
distinct()
#normalze using mean-sd standardization
normalized_cluster <- clustered_subsurface
normalized_cluster[,2:4] <- scale(clustered_subsurface[,2:4])
#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:4], kmeans, method = "wss")

#k-means
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:4], centers = 2)
# Store the cluster assignments back into the clustering data frame object
clustered_subsurface$cluster <-factor(cluster_solution$cluster)
# Look at the distribution of cluster assignments
table(clustered_subsurface$cluster)
1 2
99 26
# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_subsurface %>%
group_by(cluster) %>%
summarize_if(is.numeric, mean)
# View the resulting table
kable(sys_clus_avg)
| 1 |
19525.382 |
1.837962 |
12.69806 |
| 2 |
7850.692 |
1.030724 |
31.54683 |
#3 D plot
plot_ly(clustered_subsurface, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in, z= ~loading_ratio) %>%
add_markers(color = ~cluster)
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
clustered_slowrel_lined <- table_all %>%
filter(sys_modelinputcategory == "Subsurface slow release (lined)") %>%
select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, sys_modelinputcategory) %>%
na.omit() %>%
distinct()
#normalze using mean-sd standardization
normalized_cluster <- clustered_slowrel_lined
normalized_cluster[,2:3] <- scale(clustered_slowrel_lined[,2:3])
#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:3], kmeans, method = "wss")

#k-means
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:3], centers = 2)
# Store the cluster assignments back into the clustering data frame object
clustered_slowrel_lined$cluster <-factor(cluster_solution$cluster)
# Look at the distribution of cluster assignments
table(clustered_slowrel_lined$cluster)
1 2
58 55
# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_slowrel_lined %>%
group_by(cluster) %>%
summarize_if(is.numeric, mean)
# View the resulting table
kable(sys_clus_avg)
| 1 |
13139.92 |
1.195020 |
| 2 |
20312.98 |
1.720648 |
#3 D plot
plot_ly(clustered_slowrel_lined, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in) %>%
add_markers(color = ~cluster)
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
clustered_slowrel_unlined <- table_all %>%
filter(sys_modelinputcategory == "Subsurface slow release (unlined)") %>%
select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, loading_ratio, sys_modelinputcategory) %>%
na.omit() %>%
distinct()
#normalze using mean-sd standardization
normalized_cluster <- clustered_slowrel_unlined
normalized_cluster[,2:3] <- scale(clustered_slowrel_unlined[,2:3])
#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:3], kmeans, method = "wss")

#k-means
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:3], centers = 2)
# Store the cluster assignments back into the clustering data frame object
clustered_slowrel_unlined$cluster <-factor(cluster_solution$cluster)
# Look at the distribution of cluster assignments
table(clustered_slowrel_unlined$cluster)
1 2
61 52
# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_slowrel_unlined %>%
group_by(cluster) %>%
summarize_if(is.numeric, mean)
# View the resulting table
kable(sys_clus_avg)
| 1 |
18886.52 |
1.910538 |
19.26955 |
| 2 |
16819.60 |
1.337858 |
34.30980 |
#3 D plot
plot_ly(clustered_slowrel_unlined, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in, z = ~loading_ratio) %>%
add_markers(color = ~cluster)
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Warning in RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
Upon clustering the systems, the data will be collected in a unified
data frame for stratified sampling. The stratified sampling creates a
balanced list of systems that are reflective of the size of each smp
type and that of the internal clusters. For example, the code below has
generated 58 system (~ 10% of total unmonitored systems); the size of
each system type is proportional to that of the population, and within
each type of systems, a proportional number of systems has been chosen
from each cluster e.g., in Bioinfiltration type systems, 7 systems are
randomly chosen from cluster 3, 3 systems from cluster 2, and 2 systems
from cluster 1.
#binding all sub-type groups into one table
sys_clustered_all <- bind_rows(clustered_Bioinfiltration[,c("system_id","cluster","sys_modelinputcategory")], clustered_Bioreten_lined[,c("system_id","cluster","sys_modelinputcategory")], clustered_Bioreten_unlined[,c("system_id","cluster","sys_modelinputcategory")], clustered_slowrel_lined[,c("system_id","cluster","sys_modelinputcategory")], clustered_slowrel_unlined[,c("system_id","cluster","sys_modelinputcategory")], clustered_subsurface[,c("system_id","cluster","sys_modelinputcategory")])
#sampling code
stratified_sample <- sys_clustered_all %>%
group_by(sys_modelinputcategory, cluster) %>%
sample_frac(size=0.1)
kable(stratified_sample)
| 1023-8 |
1 |
Bioinfiltration |
| 282-3 |
1 |
Bioinfiltration |
| 1281-31 |
2 |
Bioinfiltration |
| 1315-6 |
2 |
Bioinfiltration |
| 1383-1 |
2 |
Bioinfiltration |
| 281-1 |
3 |
Bioinfiltration |
| 1138-3 |
3 |
Bioinfiltration |
| 1138-5 |
3 |
Bioinfiltration |
| 290-2 |
3 |
Bioinfiltration |
| 410-2 |
3 |
Bioinfiltration |
| 1138-2 |
3 |
Bioinfiltration |
| 416-1 |
3 |
Bioinfiltration |
| 1007-2 |
2 |
Bioretention (lined) |
| 1347-2 |
2 |
Bioretention (lined) |
| 641-5 |
1 |
Bioretention (unlined) |
| 595-8 |
2 |
Bioretention (unlined) |
| 1288-5 |
2 |
Bioretention (unlined) |
| 1198-7 |
3 |
Bioretention (unlined) |
| 1053-8 |
3 |
Bioretention (unlined) |
| 595-4 |
3 |
Bioretention (unlined) |
| 1265-9 |
3 |
Bioretention (unlined) |
| 162-2 |
3 |
Bioretention (unlined) |
| 1298-2 |
1 |
Subsurface infiltration |
| 46-1 |
1 |
Subsurface infiltration |
| 1288-10 |
1 |
Subsurface infiltration |
| 1202-10 |
1 |
Subsurface infiltration |
| 1281-41 |
1 |
Subsurface infiltration |
| 1051-7 |
1 |
Subsurface infiltration |
| 994-4 |
1 |
Subsurface infiltration |
| 1010-5 |
1 |
Subsurface infiltration |
| 1287-3 |
1 |
Subsurface infiltration |
| 280-3 |
1 |
Subsurface infiltration |
| 1298-1 |
2 |
Subsurface infiltration |
| 441-15 |
2 |
Subsurface infiltration |
| 1265-12 |
2 |
Subsurface infiltration |
| 525-2 |
1 |
Subsurface slow release (lined) |
| 1299-1 |
1 |
Subsurface slow release (lined) |
| 1202-6 |
1 |
Subsurface slow release (lined) |
| 1202-7 |
1 |
Subsurface slow release (lined) |
| 1202-2 |
1 |
Subsurface slow release (lined) |
| 1011-4 |
1 |
Subsurface slow release (lined) |
| 1279-13 |
2 |
Subsurface slow release (lined) |
| 1051-5 |
2 |
Subsurface slow release (lined) |
| 1279-2 |
2 |
Subsurface slow release (lined) |
| 1129-4 |
2 |
Subsurface slow release (lined) |
| 1307-4 |
2 |
Subsurface slow release (lined) |
| 1267-2 |
2 |
Subsurface slow release (lined) |
| 246-2 |
1 |
Subsurface slow release (unlined) |
| 443-9 |
1 |
Subsurface slow release (unlined) |
| 578-5 |
1 |
Subsurface slow release (unlined) |
| 1139-23 |
1 |
Subsurface slow release (unlined) |
| 443-10 |
1 |
Subsurface slow release (unlined) |
| 1287-4 |
1 |
Subsurface slow release (unlined) |
| 1240-1 |
2 |
Subsurface slow release (unlined) |
| 1051-3 |
2 |
Subsurface slow release (unlined) |
| 1394-1 |
2 |
Subsurface slow release (unlined) |
| 1202-9 |
2 |
Subsurface slow release (unlined) |
| 1359-5 |
2 |
Subsurface slow release (unlined) |
---
title: "SMP population assesment"
output: html_notebook
author: "Farshad Ebrahimi, Jan/30/2023"
---

The goal of this analysis is to assess the SMP population with CWL monitoring data, categorize them and find the underrepresented smp features in our data set. This is to ensure our monitoring effort results in a more comprehensive data set that covers all types of SMPs with varying design metrics such as loading ratios. The output of this analysis can assist the data collection team with their future site selection process.

```{r section 1, Loading libraries and querying smp data }
#Libraries
      library(odbc)
      library(DBI)
      library(lubridate)
      library(tidyverse)
      library(stats)
      library(gridExtra)
      library(grid)
      library(gtable)
      library(ggtext)
      library(dplyr)
      library(ggplot2)
      library(knitr)
      library(plotly)
      library(cluster)
      library(factoextra)
# DB PG14
    con <- dbConnect(odbc::odbc(), dsn = "mars14_data", uid = Sys.getenv("shiny_uid"), pwd = Sys.getenv("shiny_pwd"), MaxLongVarcharSize = 8190)

#Getting list of SMPs
    cwl_smp <- dbGetQuery(con,"SELECT DISTINCT smp_id
           FROM fieldwork.viw_deployment_full_cwl")
    
#Getting the greenit info
    smpbdv <- dbGetQuery(con,"SELECT * FROM external.tbl_smpbdv")
    sysbdv <- dbGetQuery(con,"SELECT * FROM external.tbl_systembdv")
    greenit_unified <-dbGetQuery(con,"SELECT * FROM external.viw_greenit_unified")

# join the greenit table with our smp list
    smp_features <- cwl_smp %>% 
      inner_join(smpbdv, by="smp_id")
# Unmonitored SMPs
    unmonitored_smp <- read.csv("C:\\Users\\Farshad.Ebrahimi\\OneDrive - City of Philadelphia\\Github Projects\\smp-population-assesment\\unmonitored_sites.csv") %>%
      select(smp_id = SMP.ID) %>%
      distinct()
# unmonitored smp_features
    unmonitored_smp_features <- unmonitored_smp %>%
      inner_join(smpbdv, by="smp_id")
```

The following code chunk calculates the break-down of SMP types with CWL data.

```{r section 2, Monitored SMP break-down based on type}
#number of smp types
smp_type_break <- smp_features %>% 
  group_by(smp_smptype) %>%
  summarise(count = n()) %>%
  select(Type = smp_smptype, count)
```

```{r section 3 pie chart of monitored SMPs, echo=False}
ggplot(smp_type_break, aes(x="", y= count, fill=Type))+geom_bar(width = 1, stat = "identity")+coord_polar("y", start=0)
```

```{r section 4, looking at the unmonitored SMPs to see the break-down}
#number of unmonitored smps
unmonitored_smp_type_break <- unmonitored_smp_features %>% 
  group_by(smp_smptype) %>%
  summarise(count = n()) %>%
  select(Type = smp_smptype, count)
```

The following Pie chart shows the break-down of all SMPs (monitored and unmonitored).

```{r setion 5 pie chart of unmonitored SMPs, echo=FALSE}
ggplot(unmonitored_smp_type_break, aes(x="", y= count, fill=Type))+geom_bar(width = 1, stat = "identity")+coord_polar("y", start=0)
```

```{r section 6 full join the unmonitored and monitored, echo=FALSE}
#full join the unmonitored and monirored and create a column to sum them up
smp_breakdwon_full <- smp_type_break %>%
  full_join(unmonitored_smp_type_break, by="Type") %>%
  select(Type, count_monitored = count.x, count_unmonitored = count.y)

#replace na with zero
smp_breakdwon_full[is.na(smp_breakdwon_full)] <- 0

#mutate the third column
smp_breakdwon_full <- smp_breakdwon_full %>%
  mutate(sum_all = count_monitored + count_unmonitored)

#data frame for plotting (only two columns)
pivot_df <- smp_breakdwon_full %>%
  select(Type, count_monitored, sum_all)

df_longer <- pivot_longer(pivot_df, cols = c("count_monitored", "sum_all"), names_to = "category", values_to = "count")

#Plot
ggplot(df_longer,aes(x = Type, y = count, fill= category)) +
      geom_bar(stat="identity", position=position_dodge())+
      theme(axis.text=element_text(size=6, angle = 35),
        axis.title=element_text(size=10))

```

```{r section 7, adding the monitored and unmoniored}
#combining all smps (monitored and unmonitored) and get fractions based on type of smp-finally normalize them
total_smp <- unmonitored_smp_type_break %>%
  full_join(smp_type_break, by="Type")
total_smp[is.na(total_smp)] <- 0
total_smp <- total_smp %>%
  mutate(total_number = count.x+count.y) %>%
  select(Type, total_number)
total_smp <- total_smp %>%
    mutate(sum_all = sum(total_number)) %>%
    mutate(percent_all = (total_number/sum_all)*100)

#get the percent of monitored SMP
smp_type_break_fraction <- smp_type_break %>%
    mutate(sum_all = sum(count)) %>%
    mutate(percent_monitored = (count/sum_all)*100)

#normalize the monitored smp by the total smp percents
normalized_fractions <- smp_type_break_fraction %>%
  full_join(total_smp, by="Type") %>%
  mutate(normalized_percent = percent_monitored/percent_all) %>%
  select(Type, percent_monitored, percent_all, normalized_percent)

normalized_fractions[is.na(normalized_fractions)] <- 0
```

The following table is aimed to identify the over-representing (normalized percent \> 1) and under-representing (normalized percent \<1) SMP types.

```{r}
#calculating the normalized percentage of smp types
kable(arrange(normalized_fractions, normalized_percent), caption = "Normalized SMP Break-down")
```

The table above shows Tree trenches are over-represented in our CWL data, while green roof, stormwater tree, bumpout, swale and planters are under-represented (due to monitoring limitation, etc).

The following section is focused on establishing the design metrics range for SMPs in the unmonitored SMP list. These design metrics include drainage area, storm size managed and loading ratio.

```{r section 8, rawstormsizemanaged-in , sys_impervda_ft2, and loading ratio metrics gathering}
#rawstormsizemanaged-in , sys_impervda_ft2, and loading ratio metrics gathering

table_all <- unmonitored_smp_features %>%
  left_join(sysbdv, by="system_id") %>%
  mutate(loading_ratio = sys_lrtotalda_ft2 )
```

```{r section 9}
#Clustring analysis based on 3 features of drainage area, loading ratio, and storm sized managed

clustered_moniored <- table_all %>%
  select(system_id, sys_impervda_ft2, loading_ratio, sys_rawstormsizemanaged_in) %>%
  na.omit() %>%
  distinct()

#normalze using mean-sd standardization
normalized_cluster <- clustered_moniored
normalized_cluster[,2:4] <- scale(clustered_moniored[,2:4])

#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:4], kmeans, method = "wss")

#k-means 
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:4], centers = 4)

# Store the cluster assignments back into the clustering data frame object
clustered_moniored$cluster <-factor(cluster_solution$cluster) 

# Look at the distribution of cluster assignments
table(clustered_moniored$cluster)
```

```{r section 10 analyzing the cluster features}
# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_moniored %>%
    group_by(cluster) %>%
    summarize_if(is.numeric, mean)

# View the resulting table
kable(sys_clus_avg)
```

```{r 2D Plot}
ggplot(data = clustered_moniored, aes(x = loading_ratio, y = sys_rawstormsizemanaged_in, color = cluster)) +
    geom_point()
```

```{r 3D plot}
plot_ly(clustered_moniored, x = ~sys_impervda_ft2, y = ~loading_ratio, z = ~sys_rawstormsizemanaged_in) %>%
  add_markers(color = ~cluster)
```

In order to have a more relevant clustering method that distinguishes between general types of systems (lined vs unlined or bioretention vs bio infiltration), the unmonitored SMPs were devided into broad categories of Bioinfiltration, Bioretention (lined), Bioretention (unlined), Subsurface infiltration, Subsurface slow release (lined), and Subsurface slow release (unlined). This is done by a system-level feature called sys_modelinputcategory. The code chunk below shows a break down of these SMPs. Please note that green roof and permeable pavements are not included due to the low number.

```{r section 11 sys_modelinputcategory is used to categorize systems}
#sys_modelinputcategory categories
sys_cat <- table_all %>%
  select(system_id, sys_modelinputcategory)%>%
  distinct %>%
  group_by(sys_modelinputcategory) %>%
  summarise(count = n())

kable(sys_cat)
```

```{r section 12 clustering Bioinfiltration type systems }

clustered_Bioinfiltration <- table_all %>%
  filter(sys_modelinputcategory == "Bioinfiltration") %>%
  select(system_id, sys_impervda_ft2, loading_ratio, sys_rawstormsizemanaged_in, sys_modelinputcategory) %>%
  na.omit() %>%
  distinct()

#normalze using mean-sd standardization
normalized_cluster <- clustered_Bioinfiltration
normalized_cluster[,2:4] <- scale(clustered_Bioinfiltration[,2:4])

#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:4], kmeans, method = "wss")

#k-means 
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:4], centers = 4)

# Store the cluster assignments back into the clustering data frame object
clustered_Bioinfiltration$cluster <-factor(cluster_solution$cluster) 

# Look at the distribution of cluster assignments
table(clustered_Bioinfiltration$cluster)

# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_Bioinfiltration %>%
    group_by(cluster) %>%
    summarize_if(is.numeric, mean)

# View the resulting table
kable(sys_clus_avg)

#3 D plot
plot_ly(clustered_Bioinfiltration, x = ~sys_impervda_ft2, y = ~loading_ratio, z = ~sys_rawstormsizemanaged_in) %>%
  add_markers(color = ~cluster)
```

Bioretention (lined) clustering. Note that loading ratio field is not applicable here.

```{r section 13 Bioretention lined clustering}

clustered_Bioreten_lined <- table_all %>%
  filter(sys_modelinputcategory == "Bioretention (lined)") %>%
  select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, sys_modelinputcategory) %>%
  na.omit() %>%
  distinct()

#normalze using mean-sd standardization
normalized_cluster <- clustered_Bioreten_lined
normalized_cluster[,2:3] <- scale(clustered_Bioreten_lined[,2:3])

#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:3], kmeans, method = "wss")

#k-means 
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:3], centers = 2)

# Store the cluster assignments back into the clustering data frame object
clustered_Bioreten_lined$cluster <-factor(cluster_solution$cluster) 

# Look at the distribution of cluster assignments
table(clustered_Bioreten_lined$cluster)

# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_Bioreten_lined %>%
    group_by(cluster) %>%
    summarize_if(is.numeric, mean)

# View the resulting table
kable(sys_clus_avg)

#3 D plot
plot_ly(clustered_Bioreten_lined, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in) %>%
  add_markers(color = ~cluster)
```

Bioretention (unlined) clustering.

```{r section 14 Bioretention unlined clustering}

clustered_Bioreten_unlined <- table_all %>%
  filter(sys_modelinputcategory == "Bioretention (unlined)") %>%
  select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, loading_ratio, sys_modelinputcategory) %>%
  na.omit() %>%
  distinct()

#normalze using mean-sd standardization
normalized_cluster <- clustered_Bioreten_unlined
normalized_cluster[,2:4] <- scale(clustered_Bioreten_unlined[,2:4])

#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:4], kmeans, method = "wss")

#k-means 
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:4], centers = 3)

# Store the cluster assignments back into the clustering data frame object
clustered_Bioreten_unlined$cluster <-factor(cluster_solution$cluster) 

# Look at the distribution of cluster assignments
table(clustered_Bioreten_unlined$cluster)

# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_Bioreten_unlined %>%
    group_by(cluster) %>%
    summarize_if(is.numeric, mean)

# View the resulting table
kable(sys_clus_avg)

#3 D plot
plot_ly(clustered_Bioreten_unlined, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in, z= ~loading_ratio) %>%
  add_markers(color = ~cluster)
```

```{r section 15 Subsurface infiltration clustering}

clustered_subsurface <- table_all %>%
  filter(sys_modelinputcategory == "Subsurface infiltration") %>%
  select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, loading_ratio, sys_modelinputcategory) %>%
  na.omit() %>%
  distinct()

#normalze using mean-sd standardization
normalized_cluster <- clustered_subsurface
normalized_cluster[,2:4] <- scale(clustered_subsurface[,2:4])

#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:4], kmeans, method = "wss")

#k-means 
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:4], centers = 2)

# Store the cluster assignments back into the clustering data frame object
clustered_subsurface$cluster <-factor(cluster_solution$cluster) 

# Look at the distribution of cluster assignments
table(clustered_subsurface$cluster)

# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_subsurface %>%
    group_by(cluster) %>%
    summarize_if(is.numeric, mean)

# View the resulting table
kable(sys_clus_avg)

#3 D plot
plot_ly(clustered_subsurface, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in, z= ~loading_ratio) %>%
  add_markers(color = ~cluster)
```

```{r section 16 Subsurface slow release (lined)	 clustering}

clustered_slowrel_lined <- table_all %>%
  filter(sys_modelinputcategory == "Subsurface slow release (lined)") %>%
  select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, sys_modelinputcategory) %>%
  na.omit() %>%
  distinct()

#normalze using mean-sd standardization
normalized_cluster <- clustered_slowrel_lined
normalized_cluster[,2:3] <- scale(clustered_slowrel_lined[,2:3])

#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:3], kmeans, method = "wss")

#k-means 
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:3], centers = 2)

# Store the cluster assignments back into the clustering data frame object
clustered_slowrel_lined$cluster <-factor(cluster_solution$cluster) 

# Look at the distribution of cluster assignments
table(clustered_slowrel_lined$cluster)

# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_slowrel_lined %>%
    group_by(cluster) %>%
    summarize_if(is.numeric, mean)

# View the resulting table
kable(sys_clus_avg)

#3 D plot
plot_ly(clustered_slowrel_lined, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in) %>%
  add_markers(color = ~cluster)


```

```{r section 17 Subsurface slow release (unlined)	}

clustered_slowrel_unlined <- table_all %>%
  filter(sys_modelinputcategory == "Subsurface slow release (unlined)") %>%
  select(system_id, sys_impervda_ft2, sys_rawstormsizemanaged_in, loading_ratio, sys_modelinputcategory) %>%
  na.omit() %>%
  distinct()

#normalze using mean-sd standardization
normalized_cluster <- clustered_slowrel_unlined
normalized_cluster[,2:3] <- scale(clustered_slowrel_unlined[,2:3])

#Elbow method shows a subtle elbow at cluster 4-hence 4 clusters is optimum
#number of clusters
fviz_nbclust(normalized_cluster[,2:3], kmeans, method = "wss")

#k-means 
# Cluster using kmeans with five clusters
cluster_solution <- kmeans(normalized_cluster[,2:3], centers = 2)

# Store the cluster assignments back into the clustering data frame object
clustered_slowrel_unlined$cluster <-factor(cluster_solution$cluster) 

# Look at the distribution of cluster assignments
table(clustered_slowrel_unlined$cluster)

# Group by the cluster assignment and calculate averages
sys_clus_avg <- clustered_slowrel_unlined %>%
    group_by(cluster) %>%
    summarize_if(is.numeric, mean)

# View the resulting table
kable(sys_clus_avg)

#3 D plot
plot_ly(clustered_slowrel_unlined, x = ~sys_impervda_ft2, y = ~sys_rawstormsizemanaged_in, z = ~loading_ratio) %>%
  add_markers(color = ~cluster)
```

Upon clustering the systems, the data will be collected in a unified data frame for stratified sampling. The stratified sampling creates a balanced list of systems that are reflective of the size of each smp type and that of the internal clusters. For example, the code below has generated 58 system (\~ 10% of total unmonitored systems); the size of each system type is proportional to that of the population, and within each type of systems, a proportional number of systems has been chosen from each cluster e.g., in Bioinfiltration type systems, 7 systems are randomly chosen from cluster 3, 3 systems from cluster 2, and 2 systems from cluster 1.

```{r section 18 sampling}
#binding all sub-type groups into one table
sys_clustered_all <- bind_rows(clustered_Bioinfiltration[,c("system_id","cluster","sys_modelinputcategory")], clustered_Bioreten_lined[,c("system_id","cluster","sys_modelinputcategory")], clustered_Bioreten_unlined[,c("system_id","cluster","sys_modelinputcategory")], clustered_slowrel_lined[,c("system_id","cluster","sys_modelinputcategory")], clustered_slowrel_unlined[,c("system_id","cluster","sys_modelinputcategory")], clustered_subsurface[,c("system_id","cluster","sys_modelinputcategory")])

#sampling code
stratified_sample <- sys_clustered_all %>%
    group_by(sys_modelinputcategory, cluster) %>%
    sample_frac(size=0.1)

kable(stratified_sample)
```
